*Dataset with singles*

*Load the sample file, replacing whatever data are currently in memory
use "Data\sample1_new.dta", clear

*Restrict the dataset to the variables listed here
keep pnr aar alder ///
	wage_growth_ambition wage_start_mean_ambition ///
	final_educ grad_region educ_eika fined ///
	civst faelle_nr aegte_nr koen ///
	individual erhvervsindk_13


*Regional imputation of the two ambition measures, followed by a region-specific recode
*of final_educ. For every region code i in 81-85:
*  - rows with grad_region i and a final_educ code in grade9_codes receive the value
*    found for final_educ 1109 suffixed with i
*  - rows with grad_region i and final_educ 1010 receive the value found for
*    final_educ 1110 suffixed with i
*  - the imputed rows then get i appended to final_educ (for example 1008 becomes 100881)
*The value copied is the maximum of the measure over all rows with the donor code
*(presumably the measure is constant within a code - TODO confirm).
*The same code list drives both the imputation and the recode, so every imputed code
*also receives the region suffix.
local grade9_codes 1007 1008 1023 1123 1009 1022
local grade9_csv : subinstr local grade9_codes " " ",", all
local ambition_vars wage_growth_ambition wage_start_mean_ambition

forvalues i = 81/85 {

	*9th grade
	foreach v of local ambition_vars {
		gen temp_donor = `v' if final_educ == 1109`i'
		egen temp_donor_max = max(temp_donor)
		replace `v' = temp_donor_max if inlist(final_educ, `grade9_csv') & grad_region == `i'
		drop temp_donor temp_donor_max
	}

	*Recode only after both measures are imputed, because the imputation matches on the raw codes
	foreach code of local grade9_codes {
		replace final_educ = `code'`i' if final_educ == `code' & grad_region == `i'
	}

	*10th grade
	foreach v of local ambition_vars {
		gen temp_donor = `v' if final_educ == 1110`i'
		egen temp_donor_max = max(temp_donor)
		replace `v' = temp_donor_max if final_educ == 1010 & grad_region == `i'
		drop temp_donor temp_donor_max
	}

	replace final_educ = 1010`i' if final_educ == 1010 & grad_region == `i'

}

*3.g
*Rows with final_educ 1097 receive, for each ambition measure, the largest value
*found among rows with final_educ 1198
foreach v of varlist wage_growth_ambition wage_start_mean_ambition {
	gen donor_val = `v' if final_educ == 1198
	egen donor_max = max(donor_val)
	replace `v' = donor_max if final_educ == 1097
	drop donor_val donor_max
}

**kmeans**

*Standardize both ambition measures: subtract the sample mean and divide by the sample sd.
*The moments stay available afterwards in the scalars the_mean_s and the_sd_s (starting wage)
*and the_mean_g and the_sd_g (wage growth). Each new variable is the original name plus _s.
foreach spec in "wage_start_mean_ambition s" "wage_growth_ambition g" {
	local raw : word 1 of `spec'
	local tag : word 2 of `spec'

	summarize `raw'
	scalar the_mean_`tag' = r(mean)
	scalar the_sd_`tag' = r(sd)
	generate `raw'_s = (`raw' - the_mean_`tag') / the_sd_`tag'
	summarize `raw'_s
}

*Group observations into 4 clusters on the two standardized measures.
*Starting centres are drawn at random with seed 1234.
cluster kmeans wage_start_mean_ambition_s wage_growth_ambition_s, ///
	k(4) name(ambition_type_k_4_s) start(krandom(1234))

*Cluster sizes, cluster means of the standardized measures, and clusters against fined
tabulate ambition_type_k_4_s
tabstat wage_start_mean_ambition_s wage_growth_ambition_s, by(ambition_type_k_4_s)

tabulate ambition_type_k_4_s fined, row

*Define marital status
*relationship is 1 when civst is G or a faelle_nr is recorded, 0 otherwise
gen relationship = (civst == "G") | (faelle_nr != "")

*married is 1 when civst is G
gen married = (civst == "G")

*cohab is 1 for rows in a relationship that are not married
gen cohab = (relationship == 1) & (married == 0)

*Make couple ID based on the PNR of the man
*Rows with koen 1 in a relationship carry their own pnr; rows with koen 2 carry
*aegte_nr when married and faelle_nr when cohabiting. All other rows keep "."
gen couple_id = "."
replace couple_id = pnr       if relationship == 1 & koen == "1"
replace couple_id = aegte_nr  if married == 1      & koen == "2"
replace couple_id = faelle_nr if cohab == 1        & koen == "2"
sort couple_id aar

*Keep only couples where we observe both partners (first pass)
*Within a couple-year the mean of numeric koen lies strictly between 1 and 2 only if
*both values occur (assuming koen is coded 1 or 2 - TODO confirm). Rows with
*relationship 0 are always kept.
gen sex_tmp = koen if relationship == 1
destring sex_tmp, replace
bysort couple_id aar: egen sex_mean = mean(sex_tmp)
keep if relationship == 0 | (relationship == 1 & sex_mean > 1 & sex_mean < 2)
drop sex_tmp sex_mean


sort pnr aar

**Keep only those for whom we observe ambition type**
*NOTE(review): the filter is on wage_growth_ambition, not on the cluster variable - confirm this is intended
drop if wage_growth_ambition == .

*Keep only couples where we observe both partners (second pass, after the drop above)
*NOTE(review): this pass requires a mean of exactly 1.5, which is stricter than the
*first pass (strictly between 1 and 2) - confirm the difference is intended
gen sex_tmp = koen if relationship == 1
destring sex_tmp, replace
bysort couple_id aar: egen sex_mean = mean(sex_tmp)
keep if relationship == 0 | (relationship == 1 & sex_mean == 1.5)
drop sex_tmp sex_mean

sort pnr aar

*Save*
save "Data\dataset_with_singles.dta", replace

**Merge on fields of study ex-post**
use "Data\dataset_with_singles.dta", clear

sort pnr aar

*One-to-one merge on person and year. Rows that exist only in the using file are
*discarded and no _merge variable is left behind.
merge 1:1 pnr aar using "Data\field_of_study.dta", keep(master match) nogenerate


*Now make fields categorization
*ps_field is assigned from the first two characters of educ_field_narrow, except for
*the three-character codes 041 and 042 which are handled individually
gen ps_field = "Education and Humanities" if inlist(substr(educ_field_narrow, 1, 2), "01", "02")
replace ps_field = "Social Science" if substr(educ_field_narrow, 1, 2) == "03" | educ_field_narrow == "042" /*group law with social science*/
replace ps_field = "Business" if educ_field_narrow == "041"
replace ps_field = "STEM" if inlist(substr(educ_field_narrow, 1, 2), "05", "06", "07")
replace ps_field = "Health and Welfare" if substr(educ_field_narrow, 1, 2) == "09"
replace ps_field = "Other" if ps_field == "" & inlist(fined, 3, 4) /*Includes agriculture, forestry, fisheries, veterinary, services, security, and unknown*/

*Combined level and field: fined 1 and 2 get a level label, fined 3 and 4 get their field
gen educ_level_field = "Primary" if fined == 1
replace educ_level_field = "Secondary" if fined == 2
foreach fld in "Education and Humanities" "Social Science" "Business" "STEM" "Health and Welfare" "Other" {
	replace educ_level_field = "`fld'" if inlist(fined, 3, 4) & ps_field == "`fld'"
}

tabulate educ_level_field

*Numeric version: categories are numbered 1 to 8 in the order listed in the loop
gen educ_level_field_num = .
local code = 0
foreach lbl in "Primary" "Secondary" "Education and Humanities" "Social Science" "Business" "STEM" "Health and Welfare" "Other" {
	local ++code
	replace educ_level_field_num = `code' if educ_level_field == "`lbl'"
}

tabulate educ_level_field_num

*Remove the raw field variables and the intermediate ps_field
drop hfaudd educ_field educ_field_narrow ps_field

*Save*
save "Data\dataset_with_singles.dta", replace

